import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
import plotly.express as px
from wordcloud import WordCloud
from collections import Counter
import re
import itertools
import string
from sklearn.preprocessing import LabelEncoder
import nltk
from nltk.classify import NaiveBayesClassifier
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
import sklearn
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
import sklearn.cluster as cluster
from sklearn.manifold import TSNE
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from xgboost import plot_importance
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error, accuracy_score, balanced_accuracy_score
from sklearn.metrics import precision_score, recall_score, f1_score, multilabel_confusion_matrix, confusion_matrix
from sklearn.metrics import classification_report
import warnings
warnings.filterwarnings("ignore")
df1 = pd.read_csv('mbti_1.csv', delimiter=',')  # the CSV file is comma-separated, so we pass a comma delimiter
nRow, nCol = df1.shape # The shape of the data is a tuple including rows and columns
print(f'There are {nRow} rows and {nCol} columns')
There are 8675 rows and 2 columns
df1.head() # showing the first 5 rows
|   | type | posts |
|---|---|---|
| 0 | INFJ | 'http://www.youtube.com/watch?v=qsXHcwe3krw|||... |
| 1 | ENTP | 'I'm finding the lack of me in these posts ver... |
| 2 | INTP | 'Good one _____ https://www.youtube.com/wat... |
| 3 | INTJ | 'Dear INTP, I enjoyed our conversation the o... |
| 4 | ENTJ | 'You're fired.|||That's another silly misconce... |
df1.info()  # full information for each column; it shows there is no null data
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8675 entries, 0 to 8674
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   type    8675 non-null   object
 1   posts   8675 non-null   object
dtypes: object(2)
memory usage: 135.7+ KB
# preview of the first row
df1.loc[0,'posts'] # the posts are divided by |||
"'http://www.youtube.com/watch?v=qsXHcwe3krw|||http://41.media.tumblr.com/tumblr_lfouy03PMA1qa1rooo1_500.jpg|||enfp and intj moments https://www.youtube.com/watch?v=iz7lE1g4XM4 sportscenter not top ten plays https://www.youtube.com/watch?v=uCdfze1etec pranks|||What has been the most life-changing experience in your life?|||http://www.youtube.com/watch?v=vXZeYwwRDw8 http://www.youtube.com/watch?v=u8ejam5DP3E On repeat for most of today.|||May the PerC Experience immerse you.|||The last thing my INFJ friend posted on his facebook before committing suicide the next day. Rest in peace~ http://vimeo.com/22842206|||Hello ENFJ7. Sorry to hear of your distress. It's only natural for a relationship to not be perfection all the time in every moment of existence. Try to figure the hard times as times of growth, as...|||84389 84390 http://wallpaperpassion.com/upload/23700/friendship-boy-and-girl-wallpaper.jpg http://assets.dornob.com/wp-content/uploads/2010/04/round-home-design.jpg ...|||Welcome and stuff.|||http://playeressence.com/wp-content/uploads/2013/08/RED-red-the-pokemon-master-32560474-450-338.jpg Game. Set. Match.|||Prozac, wellbrutin, at least thirty minutes of moving your legs (and I don't mean moving them while sitting in your same desk chair), weed in moderation (maybe try edibles as a healthier alternative...|||Basically come up with three items you've determined that each type (or whichever types you want to do) would more than likely use, given each types' cognitive functions and whatnot, when left by...|||All things in moderation. Sims is indeed a video game, and a good one at that. Note: a good one at that is somewhat subjective in that I am not completely promoting the death of any given Sim...|||Dear ENFP: What were your favorite video games growing up and what are your now, current favorite video games? :cool:|||https://www.youtube.com/watch?v=QyPqT8umzmY|||It appears to be too late. :sad:|||There's someone out there for everyone.|||Wait... I thought confidence was a good thing.|||I just cherish the time of solitude b/c i revel within my inner world more whereas most other time i'd be workin... just enjoy the me time while you can. Don't worry, people will always be around to...|||Yo entp ladies... if you're into a complimentary personality,well, hey.|||... when your main social outlet is xbox live conversations and even then you verbally fatigue quickly.|||http://www.youtube.com/watch?v=gDhy7rdfm14 I really dig the part from 1:46 to 2:50|||http://www.youtube.com/watch?v=msqXffgh7b8|||Banned because this thread requires it of me.|||Get high in backyard, roast and eat marshmellows in backyard while conversing over something intellectual, followed by massages and kisses.|||http://www.youtube.com/watch?v=Mw7eoU3BMbE|||http://www.youtube.com/watch?v=4V2uYORhQOk|||http://www.youtube.com/watch?v=SlVmgFQQ0TI|||Banned for too many b's in that sentence. How could you! Think of the B!|||Banned for watching movies in the corner with the dunces.|||Banned because Health class clearly taught you nothing about peer pressure.|||Banned for a whole host of reasons!|||http://www.youtube.com/watch?v=IRcrv41hgz4|||1) Two baby deer on left and right munching on a beetle in the middle. 2) Using their own blood, two cavemen diary today's latest happenings on their designated cave diary wall. 
3) I see it as...|||a pokemon world an infj society everyone becomes an optimist|||49142|||http://www.youtube.com/watch?v=ZRCEq_JFeFM|||http://discovermagazine.com/2012/jul-aug/20-things-you-didnt-know-about-deserts/desert.jpg|||http://oyster.ignimgs.com/mediawiki/apis.ign.com/pokemon-silver-version/d/dd/Ditto.gif|||http://www.serebii.net/potw-dp/Scizor.jpg|||Not all artists are artists because they draw. It's the idea that counts in forming something of your own... like a signature.|||Welcome to the robot ranks, person who downed my self-esteem cuz I'm not an avid signature artist like herself. :proud:|||Banned for taking all the room under my bed. Ya gotta learn to share with the roaches.|||http://www.youtube.com/watch?v=w8IgImn57aQ|||Banned for being too much of a thundering, grumbling kind of storm... yep.|||Ahh... old high school music I haven't heard in ages. http://www.youtube.com/watch?v=dcCRUPCdB1w|||I failed a public speaking class a few years ago and I've sort of learned what I could do better were I to be in that position again. A big part of my failure was just overloading myself with too...|||I like this person's mentality. He's a confirmed INTJ by the way. http://www.youtube.com/watch?v=hGKLI-GEc6M|||Move to the Denver area and start a new life for myself.'"
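Since each user's posts are concatenated with the '|||' separator, they can be split back into individual comments. A quick sketch (relying only on the df1 loaded above; the variable name is illustrative):
first_user_posts = df1.loc[0, 'posts'].split('|||')  # split on the '|||' delimiter
print(len(first_user_posts))   # roughly 50 comments per user
print(first_user_posts[3])     # one individual comment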
fig = plt.gcf()
fig.set_size_inches(20, 5)
sns.countplot(x="type", data = df1) #show counts of each type
<AxesSubplot:xlabel='type', ylabel='count'>
grouped_data = df1.groupby(['type']).size().reset_index()
grouped_data.columns = ['type','Count']
grouped_data = grouped_data.sort_values('Count', ascending = False)
fig = px.bar(grouped_data, x = 'type', y = 'Count', title = 'Distribution of Each Type')
fig.show() # show counts of each type in descending order
df = df1.copy()
def var_row(row):
    # variance of the word counts across a user's individual posts
    l = []
    for i in row.split('|||'):
        l.append(len(i.split()))
    return np.var(l)
df['words_per_comment'] = df['posts'].apply(lambda x: len(x.split())/50)  # each user has 50 posts, so /50 gives words per comment
df['variance_of_word_counts'] = df['posts'].apply(lambda x: var_row(x))
plt.figure(figsize=(15,10))
sns.swarmplot(x="type", y="words_per_comment", data=df)
<AxesSubplot:xlabel='type', ylabel='words_per_comment'>
df2 = df1[~df1['type'].isin(['ESFJ','ESFP','ESTJ','ESTP'])].copy()  # .copy() avoids SettingWithCopyWarning
df2['http_per_comment'] = df2['posts'].apply(lambda x: x.count('http')/50)
df2['qm_per_comment'] = df2['posts'].apply(lambda x: x.count('?')/50)
df2.head()
|   | type | posts | http_per_comment | qm_per_comment |
|---|---|---|---|---|
| 0 | INFJ | 'http://www.youtube.com/watch?v=qsXHcwe3krw|||... | 0.48 | 0.36 |
| 1 | ENTP | 'I'm finding the lack of me in these posts ver... | 0.20 | 0.10 |
| 2 | INTP | 'Good one _____ https://www.youtube.com/wat... | 0.10 | 0.24 |
| 3 | INTJ | 'Dear INTP, I enjoyed our conversation the o... | 0.04 | 0.22 |
| 4 | ENTJ | 'You're fired.|||That's another silly misconce... | 0.12 | 0.20 |
print(df2.groupby('type').agg({'http_per_comment': 'mean'}))
print(df2.groupby('type').agg({'qm_per_comment': 'mean'}))
http_per_comment
type
ENFJ 0.053263
ENFP 0.050459
ENTJ 0.053160
ENTP 0.048263
INFJ 0.065878
INFP 0.075426
INTJ 0.063593
INTP 0.070767
ISFJ 0.070602
ISFP 0.088339
ISTJ 0.061171
ISTP 0.081009
qm_per_comment
type
ENFJ 0.213053
ENFP 0.227259
ENTJ 0.240000
ENTP 0.220964
INFJ 0.208966
INFP 0.202533
INTJ 0.214849
INTP 0.221580
ISFJ 0.200964
ISFP 0.216384
ISTJ 0.214927
ISTP 0.221602
plt.figure(figsize=(15,10))
sns.jointplot(x="variance_of_word_counts", y="words_per_comment", data=df, kind="hex")
<seaborn.axisgrid.JointGrid at 0x2044811e7f0>
<Figure size 1080x720 with 0 Axes>
def plot_jointplot(mbti_type):
    # jointplot draws on its own figure, so it cannot take an existing axis
    # or a title kwarg; set the title on the returned grid instead
    df_1 = df[df['type'] == mbti_type]
    g = sns.jointplot(x="variance_of_word_counts", y="words_per_comment", data=df_1, kind="hex")
    g.fig.suptitle(mbti_type)
plt.figure(figsize=(24, 5))
i = df['type'].unique()
k = 0
for m in range(1, 3):
    for n in range(1, 7):
        df_1 = df[df['type'] == i[k]]
        sns.jointplot(x="variance_of_word_counts", y="words_per_comment", data=df_1, kind="hex")
        plt.title(i[k])
        k += 1
plt.show()
<Figure size 1728x360 with 0 Axes>
df['http_per_comment'] = df['posts'].apply(lambda x: x.count('http')/50)
df['music_per_comment'] = df['posts'].apply(lambda x: x.count('music')/50)
df['question_per_comment'] = df['posts'].apply(lambda x: x.count('?')/50)
df['img_per_comment'] = df['posts'].apply(lambda x: x.count('jpg')/50)
df['excl_per_comment'] = df['posts'].apply(lambda x: x.count('!')/50)
df['ellipsis_per_comment'] = df['posts'].apply(lambda x: x.count('...')/50)
plt.figure(figsize=(15,10))
sns.jointplot(x='words_per_comment', y='http_per_comment', data=df, kind='hex')
<seaborn.axisgrid.JointGrid at 0x2044891db50>
<Figure size 1080x720 with 0 Axes>
i = df['type'].unique()
k = 0
for m in range(0, 2):
    for n in range(0, 6):
        df_2 = df[df['type'] == i[k]]
        sns.jointplot(x='words_per_comment', y='http_per_comment', data=df_2, kind="hex")
        plt.title(i[k])
        k += 1
plt.figure(figsize=(15,10))
sns.violinplot(x='type', y='words_per_comment', data=df, inner=None, color='lightgray')
sns.stripplot(x='type', y='words_per_comment', data=df, size=4, jitter=True)
plt.ylabel('Words per comment')
plt.show()
df["length_posts"] = df["posts"].apply(len)
sns.distplot(df["length_posts"]).set_title("Distribution of Lengths of all 50 Posts")
Text(0.5, 1.0, 'Distribution of Lengths of all 50 Posts')
fig, ax = plt.subplots(len(df1['type'].unique()), sharex=True, figsize=(15, len(df1['type'].unique())))
k = 0
for i in df1['type'].unique():
    df_4 = df1[df1['type'] == i]
    wordcloud = WordCloud(max_words=1628, relative_scaling=1, normalize_plurals=False).generate(df_4['posts'].to_string())
    plt.subplot(4, 4, k + 1)
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.title(i)
    ax[k].axis("off")
    k += 1
words = list(df["posts"].apply(lambda x: x.split()))
words = [x for y in words for x in y]
Counter(words).most_common(40)
[('I', 387957),
('to', 290168),
('the', 270699),
('a', 230918),
('and', 219498),
('of', 177853),
('is', 128804),
('you', 128750),
('that', 127221),
('in', 117263),
('my', 104561),
('it', 93101),
('for', 83057),
('have', 79784),
('with', 77131),
('but', 74729),
('be', 69317),
('are', 65034),
('like', 61390),
('not', 59496),
('an', 59020),
("I'm", 57339),
('on', 57062),
('was', 56146),
('me', 55488),
('as', 53310),
('this', 52601),
('just', 48292),
('about', 46305),
('think', 46229),
('or', 45724),
("don't", 44821),
('so', 42935),
('your', 40918),
('do', 40867),
('what', 37746),
('at', 37566),
('can', 37535),
('if', 37042),
('people', 35546)]
def preprocess_text(df, remove_special=True):
    df["posts"] = df["posts"].apply(lambda x: re.sub(r'https?:\/\/.*?[\s+]', '', x.replace("|", " ") + " "))  # remove url links
    df["posts"] = df["posts"].apply(lambda x: re.sub(r'\.+', ".", x))          # collapse repeated periods
    df["posts"] = df["posts"].apply(lambda x: re.sub(r'[^\w\s]', '', x))       # drop punctuation
    df["posts"] = df["posts"].apply(lambda x: re.sub(r'[^a-zA-Z\s]', '', x))   # drop digits and other non-letters
    df["posts"] = df["posts"].apply(lambda x: x.lower())
    df["posts"] = df["posts"].apply(lambda x: re.sub(r'([a-z])\1{2,}[\s|\w]*', '', x))   # drop words with a letter repeated 3+ times
    df["posts"] = df["posts"].apply(lambda x: re.sub(r'(\b\w{0,3})?\b', '', x))          # drop very short words (3 letters or fewer)
    df["posts"] = df["posts"].apply(lambda x: re.sub(r'(\b\w{30,1000})?\b', '', x))      # drop absurdly long tokens
    if remove_special:
        # remove explicit mentions of the 16 MBTI type names so the model cannot cheat
        pers_types = ['INFP', 'INFJ', 'INTP', 'INTJ', 'ENTP', 'ENFP', 'ISTP', 'ISFP',
                      'ENTJ', 'ISTJ', 'ENFJ', 'ISFJ', 'ESTP', 'ESFP', 'ESFJ', 'ESTJ']
        pers_types = [p.lower() for p in pers_types]
        p = re.compile("(" + "|".join(pers_types) + ")")
        df["posts"] = df["posts"].apply(lambda x: p.sub('', x))  # actually apply the pattern
    return df
new_df = preprocess_text(df1)
min_words = 15
print("Before : Number of posts", len(new_df))
new_df["no. of. words"] = new_df["posts"].apply(lambda x: len(re.findall(r'\w+', x)))
new_df = new_df[new_df["no. of. words"] >= min_words].copy()
print("After : Number of posts", len(new_df))
Before : Number of posts 8675
After : Number of posts 8436
enc = LabelEncoder()
new_df['type of encoding'] = enc.fit_transform(new_df['type'])
target = new_df['type of encoding']
new_df.head(15)
|   | type | posts | no. of. words | type of encoding |
|---|---|---|---|---|
| 0 | INFJ | enfp intj moments sportscenter plays... | 344 | 8 |
| 1 | ENTP | finding lack these posts very alarming ... | 638 | 3 |
| 2 | INTP | good course which know thats bles... | 215 | 11 |
| 3 | INTJ | dear intp enjoyed conversation other es... | 611 | 10 |
| 4 | ENTJ | youre fired thats another silly misconceptio... | 315 | 2 |
| 5 | INTJ | science perfect scientist claims that ... | 189 | 10 |
| 6 | INFJ | cant draw nails haha those were done pro... | 775 | 8 |
| 7 | INTJ | tend build collection things desktop th... | 118 | 10 |
| 8 | INFJ | sure thats good question distinction betwe... | 420 | 8 |
| 9 | INTP | this position where have actually pe... | 106 | 11 |
| 10 | INFJ | time parents were fighting over dads affair... | 847 | 8 |
| 11 | ENFJ | went through break some months were... | 267 | 0 |
| 12 | INFJ | santagato entp enfj entp sure typ... | 457 | 8 |
| 13 | INTJ | fair enough thats want look like stated... | 887 | 10 |
| 14 | INTP | basically this cheezburgr very fond ... | 535 | 11 |
nltk.download('stopwords')
print(stopwords.words('english'))
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ali\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 's', 't', 'can', 'will', 'just', 'don', "don't", 'should', "should've", 'now', 'd', 'll', 'm', 'o', 're', 've', 'y', 'ain', 'aren', "aren't", 'couldn', "couldn't", 'didn', "didn't", 'doesn', "doesn't", 'hadn', "hadn't", 'hasn', "hasn't", 'haven', "haven't", 'isn', "isn't", 'ma', 'mightn', "mightn't", 'mustn', "mustn't", 'needn', "needn't", 'shan', "shan't", 'shouldn', "shouldn't", 'wasn', "wasn't", 'weren', "weren't", 'won', "won't", 'wouldn', "wouldn't"]
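One detail worth noting: CountVectorizer(stop_words='english') below uses scikit-learn's built-in English list, not the NLTK list printed above. A small sketch comparing the two (ENGLISH_STOP_WORDS is part of scikit-learn; the variable names are illustrative):
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
nltk_stops = set(stopwords.words('english'))
sklearn_stops = set(ENGLISH_STOP_WORDS)
# The lists overlap but are not identical, so the vectorizer's filtering
# differs slightly from the NLTK list shown above.
print(len(nltk_stops), len(sklearn_stops), len(nltk_stops & sklearn_stops))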
vect = CountVectorizer(stop_words='english')
train = vect.fit_transform(new_df["posts"])
train.shape
(8436, 107654)
X_train, X_test, y_train, y_test = train_test_split(train, target, test_size=0.2, stratify=target, random_state=42)
print ((X_train.shape),(y_train.shape),(X_test.shape),(y_test.shape))
(6748, 107654) (6748,) (1688, 107654) (1688,)
accuracies = {}
random_forest = RandomForestClassifier(n_estimators=100, random_state = 1)
random_forest.fit(X_train, y_train)
Y_pred = random_forest.predict(X_test)
predictions = [round(value) for value in Y_pred]
accuracy = accuracy_score(y_test, predictions)
accuracies['Random Forest'] = accuracy* 100.0
print("Accuracy: %.2f%%" % (accuracy * 100.0))
Accuracy: 39.10%
xgb = XGBClassifier()
xgb.fit(X_train,y_train)
Y_pred = xgb.predict(X_test)
predictions = [round(value) for value in Y_pred]
accuracy = accuracy_score(y_test, predictions)
accuracies['XG Boost'] = accuracy* 100.0
print("Accuracy: %.2f%%" % (accuracy * 100.0))
Accuracy: 56.16%
sgd = SGDClassifier(max_iter=5, tol=None)
sgd.fit(X_train, y_train)
Y_pred = sgd.predict(X_test)
predictions = [round(value) for value in Y_pred]
accuracy = accuracy_score(y_test, predictions)
accuracies['Gradient Descent'] = accuracy* 100.0
print("Accuracy: %.2f%%" % (accuracy * 100.0))
Accuracy: 48.40%
logreg = LogisticRegression()
logreg.fit(X_train, y_train)
Y_pred = logreg.predict(X_test)
predictions = [round(value) for value in Y_pred]
accuracy = accuracy_score(y_test, predictions)
accuracies['Logistic Regression'] = accuracy* 100.0
print("Accuracy: %.2f%%" % (accuracy * 100.0))
Accuracy: 51.60%
knn = KNeighborsClassifier(n_neighbors = 2) # n_neighbors means k
knn.fit(X_train, y_train)
Y_pred = knn.predict(X_test)
predictions = [round(value) for value in Y_pred]
# evaluate predictions
accuracy = accuracy_score(y_test, predictions)
accuracies['KNN'] = accuracy* 100.0
print("Accuracy: %.2f%%" % (accuracy * 100.0))
#try to find best k value
scoreList = []
for i in range(1, 20):
    knn2 = KNeighborsClassifier(n_neighbors=i)  # n_neighbors means k
    knn2.fit(X_train, y_train)
    scoreList.append(knn2.score(X_test, y_test))

plt.plot(range(1, 20), scoreList)
plt.xticks(np.arange(1, 20, 1))
plt.xlabel("K value")
plt.ylabel("Score")
plt.show()

acc = max(scoreList)*100
print("Maximum KNN Score is {:.2f}%".format(acc))
Accuracy: 15.40%
Maximum KNN Score is 26.07%
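Selecting k by scoring on the held-out test set leaks test information into the choice of k. A hedged alternative sketch (cross_val_score is standard scikit-learn; variable names are illustrative) that picks k on the training split only:
from sklearn.model_selection import cross_val_score
cv_scores = []
for k in range(1, 20):
    knn_cv = KNeighborsClassifier(n_neighbors=k)
    # 5-fold cross-validation on the training data only
    cv_scores.append(cross_val_score(knn_cv, X_train, y_train, cv=5).mean())
best_k = int(np.argmax(cv_scores)) + 1  # k values start at 1
print("Best k by cross-validation:", best_k)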
svm = SVC(random_state = 1)
svm.fit(X_train, y_train)
Y_pred = svm.predict(X_test)
predictions = [round(value) for value in Y_pred]
# evaluate predictions
accuracy = accuracy_score(y_test, predictions)
accuracies['SVM'] = accuracy* 100.0
print("Accuracy: %.2f%%" % (accuracy * 100.0))
Accuracy: 51.90%
pd.DataFrame.from_dict(accuracies, orient='index', columns=['Accuracies(%)'])
|   | Accuracies(%) |
|---|---|
| Random Forest | 39.099526 |
| XG Boost | 56.161137 |
| Gradient Descent | 48.400474 |
| Logistic Regression | 51.599526 |
| KNN | 15.402844 |
| SVM | 51.895735 |
colors = ["purple", "green", "orange", "magenta","#CFC60E","#0FBBAE"]
sns.set_style("whitegrid")
plt.figure(figsize=(16,5))
plt.yticks(np.arange(0,100,10))
plt.ylabel("Accuracy %")
plt.xlabel("Algorithms")
sns.barplot(x=list(accuracies.keys()), y=list(accuracies.values()), palette=colors)
plt.show()
X_train, X_test, y_train, y_test = train_test_split(train, target, test_size=0.3, stratify=target, random_state=42)
print ((X_train.shape),(y_train.shape),(X_test.shape),(y_test.shape))
(5905, 107654) (5905,) (2531, 107654) (2531,)
accuracies = {}
random_forest = RandomForestClassifier(n_estimators=100, random_state = 1)
random_forest.fit(X_train, y_train)
Y_pred = random_forest.predict(X_test)
predictions = [round(value) for value in Y_pred]
accuracy = accuracy_score(y_test, predictions)
accuracies['Random Forest'] = accuracy* 100.0
print("Accuracy: %.2f%%" % (accuracy * 100.0))
Accuracy: 39.15%
xgb = XGBClassifier()
xgb.fit(X_train,y_train)
Y_pred = xgb.predict(X_test)
predictions = [round(value) for value in Y_pred]
accuracy = accuracy_score(y_test, predictions)
accuracies['XG Boost'] = accuracy* 100.0
print("Accuracy: %.2f%%" % (accuracy * 100.0))
Accuracy: 56.74%
sgd = SGDClassifier(max_iter=5, tol=None)
sgd.fit(X_train, y_train)
Y_pred = sgd.predict(X_test)
predictions = [round(value) for value in Y_pred]
accuracy = accuracy_score(y_test, predictions)
accuracies['Gradient Descent'] = accuracy* 100.0
print("Accuracy: %.2f%%" % (accuracy * 100.0))
Accuracy: 48.79%
logreg = LogisticRegression()
logreg.fit(X_train, y_train)
Y_pred = logreg.predict(X_test)
predictions = [round(value) for value in Y_pred]
accuracy = accuracy_score(y_test, predictions)
accuracies['Logistic Regression'] = accuracy* 100.0
print("Accuracy: %.2f%%" % (accuracy * 100.0))
Accuracy: 51.96%
knn = KNeighborsClassifier(n_neighbors = 2) # n_neighbors means k
knn.fit(X_train, y_train)
Y_pred = knn.predict(X_test)
predictions = [round(value) for value in Y_pred]
# evaluate predictions
accuracy = accuracy_score(y_test, predictions)
accuracies['KNN'] = accuracy* 100.0
print("Accuracy: %.2f%%" % (accuracy * 100.0))
#try to find best k value
scoreList = []
for i in range(1, 20):
    knn2 = KNeighborsClassifier(n_neighbors=i)  # n_neighbors means k
    knn2.fit(X_train, y_train)
    scoreList.append(knn2.score(X_test, y_test))

plt.plot(range(1, 20), scoreList)
plt.xticks(np.arange(1, 20, 1))
plt.xlabel("K value")
plt.ylabel("Score")
plt.show()

acc = max(scoreList)*100
print("Maximum KNN Score is {:.2f}%".format(acc))
Accuracy: 15.92%
Maximum KNN Score is 26.20%
svm = SVC(random_state = 1)
svm.fit(X_train, y_train)
Y_pred = svm.predict(X_test)
predictions = [round(value) for value in Y_pred]
# evaluate predictions
accuracy = accuracy_score(y_test, predictions)
accuracies['SVM'] = accuracy* 100.0
print("Accuracy: %.2f%%" % (accuracy * 100.0))
Accuracy: 51.44%
pd.DataFrame.from_dict(accuracies, orient='index', columns=['Accuracies(%)'])
|   | Accuracies(%) |
|---|---|
| Random Forest | 39.154484 |
| XG Boost | 56.736468 |
| Gradient Descent | 48.794943 |
| Logistic Regression | 51.955749 |
| KNN | 15.922560 |
| SVM | 51.442118 |
colors = ["purple", "green", "orange", "magenta","#CFC60E","#0FBBAE"]
sns.set_style("whitegrid")
plt.figure(figsize=(16,5))
plt.yticks(np.arange(0,100,10))
plt.ylabel("Accuracy %")
plt.xlabel("Algorithms")
sns.barplot(x=list(accuracies.keys()), y=list(accuracies.values()), palette=colors)
plt.show()
X_train, X_test, y_train, y_test = train_test_split(train, target, test_size=0.4, stratify=target, random_state=42)
print ((X_train.shape),(y_train.shape),(X_test.shape),(y_test.shape))
(5061, 107654) (5061,) (3375, 107654) (3375,)
accuracies = {}
#Random Forest
random_forest = RandomForestClassifier(n_estimators=100, random_state = 1)
random_forest.fit(X_train, y_train)
# make predictions for test data
Y_pred = random_forest.predict(X_test)
predictions = [round(value) for value in Y_pred]
# evaluate predictions
accuracy = accuracy_score(y_test, predictions)
accuracies['Random Forest'] = accuracy* 100.0
print("Accuracy: %.2f%%" % (accuracy * 100.0))
Accuracy: 39.94%
#XG boost Classifier
xgb = XGBClassifier()
xgb.fit(X_train,y_train)
Y_pred = xgb.predict(X_test)
predictions = [round(value) for value in Y_pred]
# evaluate predictions
accuracy = accuracy_score(y_test, predictions)
accuracies['XG Boost'] = accuracy* 100.0
print("Accuracy: %.2f%%" % (accuracy * 100.0))
Accuracy: 57.01%
# Stochastic Gradient Descent
sgd = SGDClassifier(max_iter=5, tol=None)
sgd.fit(X_train, y_train)
Y_pred = sgd.predict(X_test)
predictions = [round(value) for value in Y_pred]
# evaluate predictions
accuracy = accuracy_score(y_test, predictions)
accuracies['Gradient Descent'] = accuracy* 100.0
print("Accuracy: %.2f%%" % (accuracy * 100.0))
Accuracy: 48.71%
# Logistic Regression
logreg = LogisticRegression()
logreg.fit(X_train, y_train)
Y_pred = logreg.predict(X_test)
predictions = [round(value) for value in Y_pred]
# evaluate predictions
accuracy = accuracy_score(y_test, predictions)
accuracies['Logistic Regression'] = accuracy* 100.0
print("Accuracy: %.2f%%" % (accuracy * 100.0))
Accuracy: 51.23%
#KNN Classifier
knn = KNeighborsClassifier(n_neighbors = 2) # n_neighbors means k
knn.fit(X_train, y_train)
Y_pred = knn.predict(X_test)
predictions = [round(value) for value in Y_pred]
# evaluate predictions
accuracy = accuracy_score(y_test, predictions)
accuracies['KNN'] = accuracy* 100.0
print("Accuracy: %.2f%%" % (accuracy * 100.0))
#try to find best k value
scoreList = []
for i in range(1, 20):
    knn2 = KNeighborsClassifier(n_neighbors=i)  # n_neighbors means k
    knn2.fit(X_train, y_train)
    scoreList.append(knn2.score(X_test, y_test))

plt.plot(range(1, 20), scoreList)
plt.xticks(np.arange(1, 20, 1))
plt.xlabel("K value")
plt.ylabel("Score")
plt.show()

acc = max(scoreList)*100
print("Maximum KNN Score is {:.2f}%".format(acc))
Accuracy: 17.21%
Maximum KNN Score is 25.42%
svm = SVC(random_state = 1)
svm.fit(X_train, y_train)
Y_pred = svm.predict(X_test)
predictions = [round(value) for value in Y_pred]
# evaluate predictions
accuracy = accuracy_score(y_test, predictions)
accuracies['SVM'] = accuracy* 100.0
print("Accuracy: %.2f%%" % (accuracy * 100.0))
Accuracy: 50.40%
pd.DataFrame.from_dict(accuracies, orient='index', columns=['Accuracies(%)'])
|   | Accuracies(%) |
|---|---|
| Random Forest | 39.940741 |
| XG Boost | 57.007407 |
| Gradient Descent | 48.711111 |
| Logistic Regression | 51.229630 |
| KNN | 17.214815 |
| SVM | 50.400000 |
colors = ["purple", "green", "orange", "magenta","#CFC60E","#0FBBAE"]
sns.set_style("whitegrid")
plt.figure(figsize=(16,5))
plt.yticks(np.arange(0,100,10))
plt.ylabel("Accuracy %")
plt.xlabel("Algorithms")
sns.barplot(x=list(accuracies.keys()), y=list(accuracies.values()), palette=colors)
plt.show()
data = pd.read_csv("mbti_1.csv")
data
|   | type | posts |
|---|---|---|
| 0 | INFJ | 'http://www.youtube.com/watch?v=qsXHcwe3krw|||... |
| 1 | ENTP | 'I'm finding the lack of me in these posts ver... |
| 2 | INTP | 'Good one _____ https://www.youtube.com/wat... |
| 3 | INTJ | 'Dear INTP, I enjoyed our conversation the o... |
| 4 | ENTJ | 'You're fired.|||That's another silly misconce... |
| ... | ... | ... |
| 8670 | ISFP | 'https://www.youtube.com/watch?v=t8edHB_h908||... |
| 8671 | ENFP | 'So...if this thread already exists someplace ... |
| 8672 | INTP | 'So many questions when i do these things. I ... |
| 8673 | INFP | 'I am very conflicted right now when it comes ... |
| 8674 | INFP | 'It has been too long since I have been on per... |
8675 rows × 2 columns
def get_types(row):
    t = row['type']
    I = 0; N = 0
    T = 0; J = 0

    if t[0] == 'I': I = 1
    elif t[0] == 'E': I = 0
    else: print('I-E not found')

    if t[1] == 'N': N = 1
    elif t[1] == 'S': N = 0
    else: print('N-S not found')

    if t[2] == 'T': T = 1
    elif t[2] == 'F': T = 0
    else: print('T-F not found')

    if t[3] == 'J': J = 1
    elif t[3] == 'P': J = 0
    else: print('J-P not found')

    return pd.Series({'IE': I, 'NS': N, 'TF': T, 'JP': J})

data = data.join(data.apply(lambda row: get_types(row), axis=1))
data.head(5)
|   | type | posts | IE | NS | TF | JP |
|---|---|---|---|---|---|---|
| 0 | INFJ | 'http://www.youtube.com/watch?v=qsXHcwe3krw|||... | 1 | 1 | 0 | 1 |
| 1 | ENTP | 'I'm finding the lack of me in these posts ver... | 0 | 1 | 1 | 0 |
| 2 | INTP | 'Good one _____ https://www.youtube.com/wat... | 1 | 1 | 1 | 0 |
| 3 | INTJ | 'Dear INTP, I enjoyed our conversation the o... | 1 | 1 | 1 | 1 |
| 4 | ENTJ | 'You're fired.|||That's another silly misconce... | 0 | 1 | 1 | 1 |
print ("Introversion (I):\t", data['IE'].value_counts()[0])
print ("Extroversion (E):\t", data['IE'].value_counts()[1])
print ("Intuition (N) :\t\t", data['NS'].value_counts()[0])
print ("Sensing (S):\t\t", data['NS'].value_counts()[1])
print ("Thinking (T) :\t\t", data['TF'].value_counts()[0])
print ("Feeling (F):\t\t", data['TF'].value_counts()[1])
print ("Judging (J) :\t\t", data['JP'].value_counts()[0])
print ("Perceiving (P):\t\t", data['JP'].value_counts()[1])
Introversion (I): 1999 Extroversion (E): 6676 Intuition (N) : 1197 Sensing (S): 7478 Thinking (T) : 4694 Feeling (F): 3981 Judging (J) : 5241 Perceiving (P): 3434
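As a sanity check on these tallies, the same counts can be read directly off the raw type strings with pandas (a sketch using only the existing data frame):
# Count each letter at each position of the 4-letter type string;
# the totals should match the IE/NS/TF/JP counts above.
for pos, pair in zip(range(4), ['I/E', 'N/S', 'T/F', 'J/P']):
    print(pair, data['type'].str[pos].value_counts().to_dict())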
#Plotting the distribution of each personality type indicator
N = 4
bottom = (data['IE'].value_counts()[0], data['NS'].value_counts()[0], data['TF'].value_counts()[0], data['JP'].value_counts()[0])
top = (data['IE'].value_counts()[1], data['NS'].value_counts()[1], data['TF'].value_counts()[1], data['JP'].value_counts()[1])

ind = np.arange(N)  # the x locations for the groups
width = 0.7         # the width of the bars

p1 = plt.bar(ind, bottom, width, label="E, S, F, P")              # the 0-encoded letters
p2 = plt.bar(ind, top, width, bottom=bottom, label="I, N, T, J")  # the 1-encoded letters
plt.title('Distribution across type indicators')
plt.ylabel('Count')
plt.xticks(ind, ('I / E', 'N / S', 'T / F', 'J / P',))
plt.legend()
plt.show()
data[['IE','NS','TF','JP']].corr()
|   | IE | NS | TF | JP |
|---|---|---|---|---|
| IE | 1.000000 | -0.045899 | -0.069573 | 0.161939 |
| NS | -0.045899 | 1.000000 | -0.080954 | 0.014922 |
| TF | -0.069573 | -0.080954 | 1.000000 | -0.004673 |
| JP | 0.161939 | 0.014922 | -0.004673 | 1.000000 |
cmap = plt.cm.RdBu
corr = data[['IE','NS','TF','JP']].corr()
plt.figure(figsize=(12,10))
plt.title('Features Correlation Heatmap', size=15)
sns.heatmap(corr, cmap=cmap, annot=True, linewidths=1)
<AxesSubplot:title={'center':'Features Correlation Heatmap'}>
lemmatiser = WordNetLemmatizer()

# Cache the stop words for speed
useless_words = stopwords.words("english")

# MBTI type names, to be removed from the posts later
unique_type_list = ['INFJ', 'ENTP', 'INTP', 'INTJ', 'ENTJ', 'ENFJ', 'INFP', 'ENFP',
                    'ISFP', 'ISTP', 'ISFJ', 'ISTJ', 'ESTP', 'ESFP', 'ESTJ', 'ESFJ']
unique_type_list = [x.lower() for x in unique_type_list]
# Or we can use Label Encoding (as above) of this unique personality type indicator list
# from sklearn.preprocessing import LabelEncoder
# unique_type_list = ['INFJ', 'ENTP', 'INTP', 'INTJ', 'ENTJ', 'ENFJ', 'INFP', 'ENFP',
# 'ISFP', 'ISTP', 'ISFJ', 'ISTJ', 'ESTP', 'ESFP', 'ESTJ', 'ESFJ']
# lab_encoder = LabelEncoder().fit(unique_type_list)
# Splitting the MBTI personality into 4 letters and binarizing it
# (note: here 1 encodes E/S/T/P, the reverse of the IE/NS/TF/JP columns above)
b_Pers = {'I': 0, 'E': 1, 'N': 0, 'S': 1, 'F': 0, 'T': 1, 'J': 0, 'P': 1}
b_Pers_list = [{0: 'I', 1: 'E'}, {0: 'N', 1: 'S'}, {0: 'F', 1: 'T'}, {0: 'J', 1: 'P'}]

def translate_personality(personality):
    # transform an MBTI string to a binary vector
    return [b_Pers[l] for l in personality]

# To show the result output for personality prediction
def translate_back(personality):
    # transform a binary vector back to an MBTI string
    s = ""
    for i, l in enumerate(personality):
        s += b_Pers_list[i][l]
    return s
list_personality_bin = np.array([translate_personality(p) for p in data.type])
print("Binarize MBTI list: \n%s" % list_personality_bin)
Binarize MBTI list: [[0 0 0 0] [1 0 1 1] [0 0 1 1] ... [0 0 1 1] [0 0 0 1] [0 0 0 1]]
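A quick round trip through the two helpers confirms the encoding (consistent with the first row above):
vec = translate_personality('INFJ')  # -> [0, 0, 0, 0]
print(vec)
print(translate_back(vec))           # -> 'INFJ'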
nltk.download('wordnet')
nltk.download('omw-1.4')
def pre_process_text(data, remove_stop_words=True, remove_mbti_profiles=True):
    list_personality = []
    list_posts = []
    len_data = len(data)
    i = 0

    for row in data.iterrows():
        # check code working
        # i += 1
        # if (i % 500 == 0 or i == 1 or i == len_data):
        #     print("%s of %s rows" % (i, len_data))

        # Remove and clean comments
        posts = row[1].posts

        # Remove url links
        temp = re.sub('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', ' ', posts)

        # Remove non-words - keep only letters
        temp = re.sub("[^a-zA-Z]", " ", temp)

        # Collapse runs of spaces and lowercase
        temp = re.sub(' +', ' ', temp).lower()

        # Remove words with a letter repeated 3+ times
        temp = re.sub(r'([a-z])\1{2,}[\s|\w]*', '', temp)

        # Remove stop words and lemmatize
        if remove_stop_words:
            temp = " ".join([lemmatiser.lemmatize(w) for w in temp.split(' ') if w not in useless_words])
        else:
            temp = " ".join([lemmatiser.lemmatize(w) for w in temp.split(' ')])

        # Remove MBTI personality words from posts
        if remove_mbti_profiles:
            for t in unique_type_list:
                temp = temp.replace(t, "")

        # transform mbti to binary vector
        type_labelized = translate_personality(row[1].type)  # or use lab_encoder.transform([row[1].type])[0]
        list_personality.append(type_labelized)

        # the cleaned data temp is collected here
        list_posts.append(temp)

    # returns the result as arrays
    list_posts = np.array(list_posts)
    list_personality = np.array(list_personality)
    return list_posts, list_personality
list_posts, list_personality = pre_process_text(data, remove_stop_words=True, remove_mbti_profiles=True)
print("Example :")
print("\nPost before preprocessing:\n\n", data.posts[0])
print("\nPost after preprocessing:\n\n", list_posts[0])
print("\nMBTI before preprocessing:\n\n", data.type[0])
print("\nMBTI after preprocessing:\n\n", list_personality[0])
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ali\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\ali\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
Example :
Post before preprocessing:
'http://www.youtube.com/watch?v=qsXHcwe3krw|||http://41.media.tumblr.com/tumblr_lfouy03PMA1qa1rooo1_500.jpg|||enfp and intj moments https://www.youtube.com/watch?v=iz7lE1g4XM4 sportscenter not top ten plays https://www.youtube.com/watch?v=uCdfze1etec pranks|||What has been the most life-changing experience in your life?|||http://www.youtube.com/watch?v=vXZeYwwRDw8 http://www.youtube.com/watch?v=u8ejam5DP3E On repeat for most of today.|||May the PerC Experience immerse you.|||The last thing my INFJ friend posted on his facebook before committing suicide the next day. Rest in peace~ http://vimeo.com/22842206|||Hello ENFJ7. Sorry to hear of your distress. It's only natural for a relationship to not be perfection all the time in every moment of existence. Try to figure the hard times as times of growth, as...|||84389 84390 http://wallpaperpassion.com/upload/23700/friendship-boy-and-girl-wallpaper.jpg http://assets.dornob.com/wp-content/uploads/2010/04/round-home-design.jpg ...|||Welcome and stuff.|||http://playeressence.com/wp-content/uploads/2013/08/RED-red-the-pokemon-master-32560474-450-338.jpg Game. Set. Match.|||Prozac, wellbrutin, at least thirty minutes of moving your legs (and I don't mean moving them while sitting in your same desk chair), weed in moderation (maybe try edibles as a healthier alternative...|||Basically come up with three items you've determined that each type (or whichever types you want to do) would more than likely use, given each types' cognitive functions and whatnot, when left by...|||All things in moderation. Sims is indeed a video game, and a good one at that. Note: a good one at that is somewhat subjective in that I am not completely promoting the death of any given Sim...|||Dear ENFP: What were your favorite video games growing up and what are your now, current favorite video games? :cool:|||https://www.youtube.com/watch?v=QyPqT8umzmY|||It appears to be too late. :sad:|||There's someone out there for everyone.|||Wait... I thought confidence was a good thing.|||I just cherish the time of solitude b/c i revel within my inner world more whereas most other time i'd be workin... just enjoy the me time while you can. Don't worry, people will always be around to...|||Yo entp ladies... if you're into a complimentary personality,well, hey.|||... when your main social outlet is xbox live conversations and even then you verbally fatigue quickly.|||http://www.youtube.com/watch?v=gDhy7rdfm14 I really dig the part from 1:46 to 2:50|||http://www.youtube.com/watch?v=msqXffgh7b8|||Banned because this thread requires it of me.|||Get high in backyard, roast and eat marshmellows in backyard while conversing over something intellectual, followed by massages and kisses.|||http://www.youtube.com/watch?v=Mw7eoU3BMbE|||http://www.youtube.com/watch?v=4V2uYORhQOk|||http://www.youtube.com/watch?v=SlVmgFQQ0TI|||Banned for too many b's in that sentence. How could you! Think of the B!|||Banned for watching movies in the corner with the dunces.|||Banned because Health class clearly taught you nothing about peer pressure.|||Banned for a whole host of reasons!|||http://www.youtube.com/watch?v=IRcrv41hgz4|||1) Two baby deer on left and right munching on a beetle in the middle. 2) Using their own blood, two cavemen diary today's latest happenings on their designated cave diary wall. 
3) I see it as...|||a pokemon world an infj society everyone becomes an optimist|||49142|||http://www.youtube.com/watch?v=ZRCEq_JFeFM|||http://discovermagazine.com/2012/jul-aug/20-things-you-didnt-know-about-deserts/desert.jpg|||http://oyster.ignimgs.com/mediawiki/apis.ign.com/pokemon-silver-version/d/dd/Ditto.gif|||http://www.serebii.net/potw-dp/Scizor.jpg|||Not all artists are artists because they draw. It's the idea that counts in forming something of your own... like a signature.|||Welcome to the robot ranks, person who downed my self-esteem cuz I'm not an avid signature artist like herself. :proud:|||Banned for taking all the room under my bed. Ya gotta learn to share with the roaches.|||http://www.youtube.com/watch?v=w8IgImn57aQ|||Banned for being too much of a thundering, grumbling kind of storm... yep.|||Ahh... old high school music I haven't heard in ages. http://www.youtube.com/watch?v=dcCRUPCdB1w|||I failed a public speaking class a few years ago and I've sort of learned what I could do better were I to be in that position again. A big part of my failure was just overloading myself with too...|||I like this person's mentality. He's a confirmed INTJ by the way. http://www.youtube.com/watch?v=hGKLI-GEc6M|||Move to the Denver area and start a new life for myself.'
Post after preprocessing:
moment sportscenter top ten play prank life changing experience life repeat today may perc experience immerse last thing friend posted facebook committing suicide next day rest peace hello sorry hear distress natural relationship perfection time every moment existence try figure hard time time growth welcome stuff game set match prozac wellbrutin least thirty minute moving leg mean moving sitting desk chair weed moderation maybe try edible healthier alternative basically come three item determined type whichever type want would likely use given type cognitive function whatnot left thing moderation sims indeed video game good one note good one somewhat subjective completely promoting death given sim dear favorite video game growing current favorite video game cool appears late sad someone everyone wait thought confidence good thing cherish time solitude b c revel within inner world whereas time workin enjoy time worry people always around yo lady complimentary personality well hey main social outlet xbox live conversation even verbally fatigue quickly really dig part banned thread requires get high backyard roast eat marshmellows backyard conversing something intellectual followed massage kiss banned many b sentence could think b banned watching movie corner dunce banned health class clearly taught nothing peer pressure banned whole host reason two baby deer left right munching beetle middle using blood two caveman diary today latest happening designated cave diary wall see pokemon world society everyone becomes optimist artist artist draw idea count forming something like signature welcome robot rank person downed self esteem cuz avid signature artist like proud banned taking room bed ya gotta learn share roach banned much thundering grumbling kind storm yep ahh old high school music heard age failed public speaking class year ago sort learned could better position big part failure overloading like person mentality confirmed way move denver area start new life
MBTI before preprocessing:
INFJ
MBTI after preprocessing:
[0 0 0 0]
nRow, nCol = list_personality.shape
print(f'No. of posts = {nRow} and No. of Personalities = {nCol} ')
No. of posts = 8675 and No. of Personalities = 4
# Vectorizing the database posts to a matrix of token counts for the model
cntizer = CountVectorizer(analyzer="word",
max_features=1000,
max_df=0.75,
min_df=0.15)
# the feature should be made of word n-gram
# Learn the vocabulary dictionary and return term-document matrix
print("Using CountVectorizer :")
X_cnt = cntizer.fit_transform(list_posts)
#The enumerate object yields pairs containing a count and a value (useful for obtaining an indexed list)
feature_names = list(enumerate(cntizer.get_feature_names()))
print("10 feature names can be seen below")
print(feature_names[0:10])
# Standardization / feature-scaling stage:
# Transform the count matrix to a normalized tf or tf-idf representation
tfizer = TfidfTransformer()
# Learn the idf vector (fit) and transform a count matrix to a tf-idf representation
print("\nUsing Tf-idf :")
print("Now the dataset size is as below")
X_tfidf = tfizer.fit_transform(X_cnt).toarray()
print(X_tfidf.shape)
Using CountVectorizer :
10 feature names can be seen below
[(0, 'able'), (1, 'absolutely'), (2, 'act'), (3, 'actually'), (4, 'advice'), (5, 'age'), (6, 'ago'), (7, 'agree'), (8, 'almost'), (9, 'alone')]

Using Tf-idf :
Now the dataset size is as below
(8675, 362)
# #counting top 10 words
# reverse_dic = {}
# for key in cntizer.vocabulary_:
#     reverse_dic[cntizer.vocabulary_[key]] = key
# top_10 = np.asarray(np.argsort(np.sum(X_cnt, axis=0))[0, -10:][0, ::-1]).flatten()
# [reverse_dic[v] for v in top_10]
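A runnable version of the commented-out snippet above, recovering the ten most frequent vocabulary words from the count matrix (a sketch; reverse_dic and top_10 are illustrative names):
# Invert the vocabulary mapping (index -> word)
reverse_dic = {idx: word for word, idx in cntizer.vocabulary_.items()}
# Column sums of the sparse count matrix give each word's total frequency
counts = np.asarray(X_cnt.sum(axis=0)).flatten()
top_10 = counts.argsort()[-10:][::-1]
print([reverse_dic[v] for v in top_10])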
personality_type = [ "IE: Introversion (I) / Extroversion (E)", "NS: Intuition (N) / Sensing (S)",
"FT: Feeling (F) / Thinking (T)", "JP: Judging (J) / Perceiving (P)" ]
for l in range(len(personality_type)):
    print(personality_type[l])
IE: Introversion (I) / Extroversion (E)
NS: Intuition (N) / Sensing (S)
FT: Feeling (F) / Thinking (T)
JP: Judging (J) / Perceiving (P)
print("X: 1st posts in tf-idf representation\n%s" % X_tfidf[0])
X: 1st posts in tf-idf representation [0. 0. 0. 0. 0. 0.09319853 0.08124713 0. 0. 0. 0. 0. 0. 0. 0.05193588 0. 0. 0. 0. 0. 0.06118994 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.06331272 0.08149441 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.19070865 0. 0.05715397 0. 0. 0.08505294 0. 0.08601182 0.08784519 0.10562002 0. 0. 0. 0.05902795 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.08120096 0. 0. 0. 0.0506553 0. 0.06775466 0.13611624 0. 0. 0. 0.14317014 0. 0. 0. 0. 0. 0. 0.18266816 0. 0. 0. 0. 0. 0. 0.0969914 0. 0. 0. 0. 0. 0. 0.05141923 0. 0.08121634 0. 0.33694926 0. 0. 0. 0. 0. 0. 0. 0. 0.14447234 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.06708402 0. 0. 0.09202151 0.0969914 0. 0.09416993 0.15420378 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.06475308 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.06390403 0.07029411 0. 0. 0.09119001 0.07335237 0.19731788 0. 0. 0.1543941 0.09206361 0. 0.07917029 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.06058368 0. 0.06892013 0.0629845 0. 0.0574619 0. 0. 0. 0. 0. 0. 0.17640246 0. 0. 0. 0.088428 0.04643524 0.08253358 0. 0. 0. 0. 0. 0.06831129 0.09414754 0. 0. 0. 0.07353664 0. 0. 0. 0. 0.08018563 0. 0. 0. 0. 0. 0.13875143 0. 0. 0.11123509 0. 0.07080244 0. 0. 0. 0.08562507 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.06837488 0. 0. 0.06753657 0. 0. 0. 0.05526021 0.09891488 0. 0. 0. 0. 0.07264896 0. 0.04894597 0. 0. 0. 0. 0.07637215 0. 0. 0.0955635 0. 0. 0. 0. 0. 0. 0. 0.08015577 0.05319054 0.09809769 0. 0. 0.07818396 0.08585308 0. 0.07476537 0. 0. 0. 0. 0. 0.07898229 0. 0. 0.09276587 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.05369298 0.06206231 0. 0.1763648 0. 0. 0. 0. 0. 0. 0. 0.12166005 0. 0. 0.12764052 0.15363641 0. 0. 0.06952892 0. 0.09496263 0. 0. 0.2750595 0. 0.04856738 0. 0. 0.09964178 0.04727121 0. 0. 0.18896965 0.04734612 0. 0. 0. 0.0831464 0. 0. 0. 0. 0. 0. 0. 0.13185703 0. 0. 0. 0. 0. 0.05557359 0. 0. ]
print("For MBTI personality type : %s" % translate_back(list_personality[0,:]))
print("Y : Binarized MBTI 1st row: %s" % list_personality[0,:])
For MBTI personality type : INFJ
Y : Binarized MBTI 1st row: [0 0 0 0]
# Posts in tf-idf representation
X = X_tfidf
#Random Forest model for MBTI dataset
# Individually training a classifier for each MBTI personality dimension
for l in range(len(personality_type)):
    Y = list_personality[:, l]

    # split data into train and test sets
    X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=7)

    # fit model on training data
    model = RandomForestClassifier()
    model.fit(X_train, y_train)

    # make predictions for test data
    y_pred = model.predict(X_test)
    predictions = [round(value) for value in y_pred]

    # evaluate predictions
    accuracy = accuracy_score(y_test, predictions)
    print("%s Accuracy: %.2f%%" % (personality_type[l], accuracy * 100.0))
IE: Introversion (I) / Extroversion (E) Accuracy: 77.18%
NS: Intuition (N) / Sensing (S) Accuracy: 86.21%
FT: Feeling (F) / Thinking (T) Accuracy: 66.35%
JP: Judging (J) / Perceiving (P) Accuracy: 62.58%
#XGBoost model for MBTI dataset
# Individually training a classifier for each MBTI personality dimension
for l in range(len(personality_type)):
    Y = list_personality[:, l]

    # split data into train and test sets
    X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=7)

    # fit model on training data
    model = XGBClassifier()
    model.fit(X_train, y_train)

    # make predictions for test data
    y_pred = model.predict(X_test)
    predictions = [round(value) for value in y_pred]

    # evaluate predictions
    accuracy = accuracy_score(y_test, predictions)
    print("%s Accuracy: %.2f%%" % (personality_type[l], accuracy * 100.0))
IE: Introversion (I) / Extroversion (E) Accuracy: 76.22%
NS: Intuition (N) / Sensing (S) Accuracy: 85.98%
FT: Feeling (F) / Thinking (T) Accuracy: 67.65%
JP: Judging (J) / Perceiving (P) Accuracy: 59.93%
# Stochastic Gradient Descent for MBTI dataset
# Individually training a classifier for each MBTI personality dimension
for l in range(len(personality_type)):
    Y = list_personality[:, l]

    # split data into train and test sets
    X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=7)

    # fit model on training data
    model = SGDClassifier()
    model.fit(X_train, y_train)

    # make predictions for test data
    y_pred = model.predict(X_test)
    predictions = [round(value) for value in y_pred]

    # evaluate predictions
    accuracy = accuracy_score(y_test, predictions)
    print("%s Accuracy: %.2f%%" % (personality_type[l], accuracy * 100.0))
IE: Introversion (I) / Extroversion (E) Accuracy: 77.53%
NS: Intuition (N) / Sensing (S) Accuracy: 86.21%
FT: Feeling (F) / Thinking (T) Accuracy: 69.50%
JP: Judging (J) / Perceiving (P) Accuracy: 64.85%
# Logistic Regression for MBTI dataset
# Individually training a classifier for each MBTI personality dimension
for l in range(len(personality_type)):
    Y = list_personality[:, l]

    # split data into train and test sets
    X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=7)

    # fit model on training data
    model = LogisticRegression()
    model.fit(X_train, y_train)

    # make predictions for test data
    y_pred = model.predict(X_test)
    predictions = [round(value) for value in y_pred]

    # evaluate predictions
    accuracy = accuracy_score(y_test, predictions)
    print("%s Accuracy: %.2f%%" % (personality_type[l], accuracy * 100.0))
IE: Introversion (I) / Extroversion (E) Accuracy: 77.03%
NS: Intuition (N) / Sensing (S) Accuracy: 86.21%
FT: Feeling (F) / Thinking (T) Accuracy: 71.11%
JP: Judging (J) / Perceiving (P) Accuracy: 65.46%
# KNN model for MBTI dataset
# Individually training a classifier for each MBTI personality dimension
for l in range(len(personality_type)):
    Y = list_personality[:, l]

    # split data into train and test sets
    X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=7)

    # fit model on training data
    model = KNeighborsClassifier(n_neighbors=2)  # n_neighbors means k
    model.fit(X_train, y_train)

    # make predictions for test data
    y_pred = model.predict(X_test)
    predictions = [round(value) for value in y_pred]

    # evaluate predictions
    accuracy = accuracy_score(y_test, predictions)
    print("%s Accuracy: %.2f%%" % (personality_type[l], accuracy * 100.0))
IE: Introversion (I) / Extroversion (E) Accuracy: 76.87%
NS: Intuition (N) / Sensing (S) Accuracy: 84.98%
FT: Feeling (F) / Thinking (T) Accuracy: 56.78%
JP: Judging (J) / Perceiving (P) Accuracy: 44.14%
# SVM model for MBTI dataset
# Individually training a classifier for each MBTI personality dimension
for l in range(len(personality_type)):
    Y = list_personality[:, l]

    # split data into train and test sets
    X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=7)

    # fit model on training data
    model = SVC(random_state=1)
    model.fit(X_train, y_train)

    # make predictions for test data
    y_pred = model.predict(X_test)
    predictions = [round(value) for value in y_pred]

    # evaluate predictions
    accuracy = accuracy_score(y_test, predictions)
    print("%s Accuracy: %.2f%%" % (personality_type[l], accuracy * 100.0))
IE: Introversion (I) / Extroversion (E) Accuracy: 77.53%
NS: Intuition (N) / Sensing (S) Accuracy: 86.21%
FT: Feeling (F) / Thinking (T) Accuracy: 70.38%
JP: Judging (J) / Perceiving (P) Accuracy: 65.39%
# setup parameters for xgboost
param = {}
param['n_estimators'] = 200  #100
param['max_depth'] = 2       #3
param['nthread'] = 8         #1
param['learning_rate'] = 0.2 #0.1

# Individually training a classifier for each MBTI personality dimension
for l in range(len(personality_type)):
    Y = list_personality[:, l]

    # split data into train and test sets
    seed = 7
    test_size = 0.3
    X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=test_size, random_state=seed)

    # fit model on training data
    model = XGBClassifier(**param)
    model.fit(X_train, y_train)

    # make predictions for test data
    y_pred = model.predict(X_test)
    predictions = [round(value) for value in y_pred]

    # evaluate predictions
    accuracy = accuracy_score(y_test, predictions)
    print("%s Accuracy: %.2f%%" % (personality_type[l], accuracy * 100.0))
IE: Introversion (I) / Extroversion (E) Accuracy: 77.03%
NS: Intuition (N) / Sensing (S) Accuracy: 86.21%
FT: Feeling (F) / Thinking (T) Accuracy: 67.69%
JP: Judging (J) / Perceiving (P) Accuracy: 64.27%
my_posts = """They act like they care They tell me to share But when I carve the stories on my arm The doctor just calls it self harm I’m not asking for attention There’s a reason I have apprehensions I just need you to see What has become of me||| I know I’m going crazy But they think my thoughts are just hazy When in that chaos, in that confusion I’m crying out for help, to escape my delusions||| Mental health is a state of mind How does one keep that up when assistance is denied All my failed attempts to fight the blaze You treat it like its a passing phase||| Well stop, its not, because mental illness is real Understand that we’re all not made of steel Because when you brush these issues under the carpet You make it seem like its our mistake we’re not guarded||| Don’t you realise that its a problem that needs to be addressed Starting at home, in our nest Why do you keep your mouths shut about such things Instead of caring for those with broken wings||| What use is this social stigma When mental illness is not even such an enigma Look around and you’ll see the numbers of the affected hiding under the covers ||| This is an issue that needs to be discussed Not looked down upon with disgust Mental illness needs to be accepted So that people can be protected ||| Let me give you some direction People need affection The darkness must be escaped Only then the lost can be saved||| Bring in a change Something not very strange The new year is here Its time to eradicate fear||| Recognise the wrists under the knives To stop mental illness from taking more lives Let’s break the convention Start ‘suicide prevention’.||| Hoping the festival of lights drives the darkness of mental illness away """
mydata = pd.DataFrame(data={'type': ['INFJ'], 'posts': [my_posts]})
my_posts, dummy = pre_process_text(mydata, remove_stop_words=True, remove_mbti_profiles=True)
my_X_cnt = cntizer.transform(my_posts)
my_X_tfidf = tfizer.transform(my_X_cnt).toarray()
# setup parameters for xgboost
param = {}
param['n_estimators'] = 200
param['max_depth'] = 2
param['nthread'] = 8
param['learning_rate'] = 0.2

#XGBoost model for MBTI dataset
result = []
# Individually training a classifier for each MBTI personality dimension
for l in range(len(personality_type)):
    print("%s classifier trained" % (personality_type[l]))

    Y = list_personality[:, l]

    # split data into train and test sets
    X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=7)

    # fit model on training data
    model = XGBClassifier(**param)
    model.fit(X_train, y_train)

    # make predictions for my data
    y_pred = model.predict(my_X_tfidf)
    result.append(y_pred[0])
IE: Introversion (I) / Extroversion (E) classifier trained
NS: Intuition (N) / Sensing (S) classifier trained
FT: Feeling (F) / Thinking (T) classifier trained
JP: Judging (J) / Perceiving (P) classifier trained
print("The result is: ", translate_back(result))
The result is: INFP